// Package org.terrier.indexing
//
// Source Code of org.terrier.indexing.MSExcelDocument

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is MSExcelDocument.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
*/
package org.terrier.indexing;


import java.io.CharArrayReader;
import java.io.CharArrayWriter;
import java.io.File;
import java.io.InputStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;

/** Implements a Document object for a Microsoft Excel spreadsheet.
*  Uses HSSF and POIFS subparts of the Jakarta-POI project. This means
*  that to use or compile this module, you must have the
*  poi-?.?.?-final-*.jar in your classpath. <p>
*  A bug in the current stable POI library seems to mean that large
*  Excel files cannot be parsed - see the MAXFILESIZE field to control
*  the maximum file size that this class will attempt to read.
@author Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*/
public class MSExcelDocument extends FileDocument
{
  protected static final Logger logger = Logger.getLogger(MSExcelDocument.class);
  /** Size of 1MB in bytes */
  protected static final int MEGABYTE = 1048576;

  /** Maximum file size that this class will attempt to open. Set to 0
    * to ignore. Set by propery <tt>indexing.excel.maxfilesize.mb</tt>,
    * default 0.5 */
  protected static final long MAXFILESIZE = (long)((float)MEGABYTE *
    Float.parseFloat(
      ApplicationSetup.getProperty("indexing.excel.maxfilesize.mb", "0.5")
      ));

  /** Construct a new MSExcelDocument Document object
    * @param filename the file that is opened for this
    * @param docStream the actual stream of the open file */
  public MSExcelDocument(String filename, InputStream docStream, Tokeniser tokeniser)
  {
    super(filename, docStream, tokeniser);
  }
  /**
   * Construct a new MSExcelDocument Document object
   * @param docStream
   * @param docProperties
   * @param tok
   */
  public MSExcelDocument(InputStream docStream,
      Map<String, String> docProperties, Tokeniser tok) {
    super(docStream, docProperties, tok);
  }
  /**
   * Construct a new MSExcelDocument Document object
   * @param docReader
   * @param docProperties
   * @param tok
   */
  public MSExcelDocument(Reader docReader, Map<String, String> docProperties,
      Tokeniser tok) {
    super(docReader, docProperties, tok);
  }
  /**
   * Construct a new MSExcelDocument Document object
   * @param filename
   * @param docReader
   * @param tok
   */
  public MSExcelDocument(String filename, Reader docReader, Tokeniser tok) {
    super(filename, docReader, tok);
  }
 
  /** Get the reader appropriate for this InputStream. This involves
    converting the Excel document to a stream of words. On failure
    returns null and sets EOD to true, so no terms can be read from
    the object.
    Uses the property <tt>indexing.excel.maxfilesize.mb</tt> to
    determine if the file is too big to open
    @param docStream */
  @SuppressWarnings("unchecked") //poi version used is for Java 1.4.
  protected Reader getReader(InputStream docStream)
  {
   
    if (MAXFILESIZE > 0 && (filename == null || new File(filename).length() > MAXFILESIZE))
    { 
     
      //logger.warn("WARNING: Excel document "+ filename+
//        " is too large for POI. Ignoring.");
      EOD = true;
      return null
    }
    try
    {
      CharArrayWriter writer = new CharArrayWriter();
      //opening the file system
      POIFSFileSystem fs = new POIFSFileSystem(docStream);
      //opening the work book
      HSSFWorkbook workbook = new HSSFWorkbook(fs);
     
      for (int i = 0; i < workbook.getNumberOfSheets(); i++ )
      {
        //got the i-th sheet from the work book
        HSSFSheet sheet = workbook.getSheetAt(i);
       
        Iterator rows = sheet.rowIterator();
        while( rows.hasNext() ) {
         
          HSSFRow row = (HSSFRow) rows.next();
          Iterator cells = row.cellIterator();
          while( cells.hasNext() ) {
            HSSFCell cell = (HSSFCell) cells.next();
            switch ( cell.getCellType() ) {
              case HSSFCell.CELL_TYPE_NUMERIC:
                String num = Double.toString(cell.getNumericCellValue()).trim();
                if(num.length() > 0) {
                  writer.write(num + " ");
                }
                break;
              case HSSFCell.CELL_TYPE_STRING:
                String text = cell.getStringCellValue().trim();
                if(text.length() > 0) {
                  writer.write(text + " ");
                }
                break;
            }
          }
        }
      }
      return new CharArrayReader(writer.toCharArray());
    }
    catch(Exception e )
    {
      //logger.warn("WARNING: Problem converting excel document"+e);
      EOD = true;
      return null;
    }
  }
}
/*
 * TOP
 *
 * Related Classes of org.terrier.indexing.MSExcelDocument
 *
 * TOP
 * Copyright © 2018 www.massapi.com. All rights reserved.
 * All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.
 */